Modelling the Serie A football data

Statistical Methods in Data Science II & Lab

Domenico Mattia Cinque 1784965
July 20 2021

Introduction

In this project we try to replicate and extend Example 7.2 in *Bayesian Modelling Using WinBUGS*. The data are those from the 2020-2021 season of the Italian Serie A. We have \(K=20\) different teams and \(N = K(K-1)=380\) records of football matches played along the season. We will consider only a few columns of the original dataset:


# Read the 2020-2021 Serie A results and keep only the columns we need:
# home/away team names and full-time home/away goals.
cols <- c('HomeTeam', 'AwayTeam', 'FTHG', 'FTAG')
data <- read.csv('I1.csv')
df <- data[, cols]

# Encode the team names as factors; their sorted levels give the
# team index 1..K used throughout the analysis.
for (col in c('HomeTeam', 'AwayTeam')) {
  df[[col]] <- as.factor(df[[col]])
}
teams <- sort(unique(df$HomeTeam))

str(df)

'data.frame':   380 obs. of  4 variables:
 $ HomeTeam: Factor w/ 20 levels "Atalanta","Benevento",..: 6 20 13 7 16 9 11 18 4 15 ...
 $ AwayTeam: Factor w/ 20 levels "Atalanta","Benevento",..: 18 14 12 5 4 15 3 1 10 2 ...
 $ FTHG    : int  1 0 0 4 1 3 2 2 0 2 ...
 $ FTAG    : int  0 0 2 1 1 0 0 4 2 3 ...

The Model

In this model, proposed by Maher (1982), we denote by \(y_{ij}\) the number of goals scored in the \(i\)-th match by the home team if \(j=1\) or by the away team if \(j=2\). The model is called Poisson log-linear since the link function is logarithmic: \[ \begin{align*} Y_{ij}&\sim \text{Pois}(\lambda_{ij}) \quad & \text{for} \;j=1,2 \quad i=1,\dots ,N \\ \log(\lambda_{i1}) & =\mu+\text{home} +a_{HT_i}+d_{AT_i} \\ \log(\lambda_{i2}) &= \mu + a_{AT_i}+d_{HT_i} & \end{align*}\] where \(\text{home}\) is a parameter that takes into account the advantage of playing at home, and \(a_k\) and \(d_k\) (\(k=1,\dots,K\)) are respectively the attacking and defensive parameters of each team. On these two we use sum-to-zero constraints: \[ \sum_{k=1}^K a_k = 0 \quad \sum_{k=1}^K d_k=0\] Notice that \(a_k>0\) means that the team scores more than the average, while for \(d_k\) it is the opposite: good defensive performances correspond to negative defensive parameters. We choose for all the parameters the same normal prior distribution with high variance

\[\mu,\text{home},a_k,d_k \sim \mathcal N(0, 1\times10^4) \quad k=1,\dots,K \]


# JAGS specification of the Poisson log-linear model (Maher, 1982).
# Data: goals1/goals2 = full-time home/away goals of match i,
# ht/at = home/away team index (1..K), N = matches, K = teams.
model <-function(){
  # Likelihood: home and away goals of each match are conditionally
  # independent Poisson counts
  for (i in 1:N){
    goals1[i] ~ dpois( lambda1[i] )
    goals2[i] ~ dpois( lambda2[i] )
    
    # Log link: overall mean + home advantage (home side only)
    # + attack of the scoring team + defence of the conceding team
    log(lambda1[i]) <- mu + home + a[ ht[i] ] + d[ at[i] ]
    log(lambda2[i]) <- mu        + a[ at[i] ] + d[ ht[i] ]
  }
  
  # Sum-to-zero constraints: the first team's parameters are fully
  # determined by the other K-1, making the model identifiable
  a[1] <- -sum( a[2:K] )
  d[1] <- -sum( d[2:K] )
  
  # Vague normal priors (JAGS dnorm takes a precision: 1e-4 -> sd = 100)
  mu   ~ dnorm(0, 1.0E-4)
  home ~ dnorm(0, 1.0E-4)
  
  for (i in 2:K){
    a[i] ~ dnorm(0, 1.0E-4)
    d[i] ~ dnorm(0, 1.0E-4)
  }
  
  # League replication (posterior predictive): simulate every pairing
  for (i in 1:K){
    for (j in 1:K){
      # We simulate also the games of a team against itself (i = j);
      # those are removed when the points are totalled below
      goals1.rep[i,j] ~ dpois(lambda1.rep[i,j])
      goals2.rep[i,j] ~ dpois(lambda2.rep[i,j])
      
      # Same linear predictor, with i as home team and j as away team
      log(lambda1.rep[i,j]) <-  mu + home + a[ i ] + d[ j ]
      log(lambda2.rep[i,j]) <-  mu        + a[ j ] + d[ i ]
      
      # Replicated goal difference (home - away)
      goal.diff.rep[i,j] <- goals1.rep[i,j]-goals2.rep[i,j] 
    }
  }
  for (i in 1:K){
    for (j in 1:K){
      ## step(x) = 1 if x >= 0, 0 otherwise
      ## equals(x,y) = 1 if x == y
      # Points earned by the home team (i): 3 for a win, 1 for a draw
      points1[i,j] <- 3*(1-step(-goal.diff.rep[i,j])) +
        1*equals(goal.diff.rep[i,j],0)
      # Points earned by the away team (j)
      points2[i,j] <- 3*(1-step( goal.diff.rep[i,j])) +
        1*equals(goal.diff.rep[i,j],0)
    }
  }
  for (i in 1:K){
    # Total = points from home games + points from away games,
    # minus the two fictitious games against itself
    total.points[i] <- sum(points1[i,1:K]) - points1[i,i] +
      sum(points2[1:K,i]) - points2[i,i] 
  }
  
  # Ranking 
  for (i in 1:K){ 
    # rank(i) = K + 1 - #{j : points(i) >= points(j)}; the count
    # includes team i itself, so the best team gets rank 1 and the
    # worst gets rank K (tied teams share the better rank).
    # BUG FIX: the original subtracted an extra 1, producing 0-based
    # ranks, so rank.probs below could never record a 1st place.
    ranks[i] <- K + 1 - sum(total.points[i] >= total.points)
  }
  # Indicator matrix: posterior mean gives the probability that
  # team i finishes the replicated league in position j
  for (i in 1:K){
    for (j in 1:K){
      
      rank.probs[i,j] <- equals( ranks[i], j )
    }
  } 
}


# Dimensions: number of matches and number of teams
N <- nrow(df)
K <- length(teams)

# Data passed to JAGS. The team factors are mapped explicitly to their
# integer level codes (1..K) rather than relying on implicit coercion;
# the codes follow the alphabetical order of `teams`, so the values
# are unchanged.
dat.jags <- list("goals1" = df$FTHG, "goals2" = df$FTAG,
                 "ht" = as.numeric(df$HomeTeam),
                 "at" = as.numeric(df$AwayTeam),
                 "N" = N, "K" = K)

# Parameters to monitor, including the replicated leagues used for
# the posterior-predictive checks below
mod.params <- c("mu", "home", "a", "d", "ranks", 
                "rank.probs","total.points",
                "goals1.rep", "goals2.rep")
 
# Fit: 3 chains, 9000 iterations each, 1000 burn-in, thinning of 10
mod.fit <- jags(model.file = model,
                n.chains = 3,
                data = dat.jags,
                parameters.to.save = mod.params,
                n.iter = 9000, n.burnin = 1000, n.thin = 10)

Compiling model graph
   Resolving undeclared variables
   Allocating nodes
Graph information:
   Observed stochastic nodes: 760
   Unobserved stochastic nodes: 840
   Total graph size: 9412

Initializing model

# Posterior means of the attack/defence parameters, simulated final
# rank and simulated total points for each team
results <- data.frame(teams,
                  round(mod.fit$BUGSoutput$mean$a, 3),
                  round(mod.fit$BUGSoutput$mean$d, 3),
                  round(mod.fit$BUGSoutput$mean$ranks, 3),
                  round(mod.fit$BUGSoutput$mean$total.points, 3))

colnames(results) <- c('teams', 'a', 'd',
                       'rank', 'scores')

# Official 2020-2021 final points, in alphabetical team order
actual.scores <- c(78, 33, 41, 37, 23, 40, 42, 91, 78, 68, 79, 77,
                   20, 62, 52, 62, 39, 37, 40, 45)

results$real.scores <- actual.scores

# Sort by the real final standings, best team first. order() is the
# idiomatic (and stable) replacement for sort(..., index.return = TRUE);
# TRUE is spelled out instead of the reassignable shorthand T.
results <- results[order(results$real.scores, decreasing = TRUE), ]

# Absolute error between simulated and real points
results$abs.score.err <- round(abs(results$scores -
                                     results$real.scores), 3)

row.names(results) <- NULL
results

        teams      a      d   rank scores real.scores abs.score.err
1       Inter  0.447 -0.452  1.224 83.219          91         7.781
2       Milan  0.268 -0.306  3.373 72.265          79         6.735
3    Atalanta  0.473 -0.148  2.426 76.525          78         1.475
4    Juventus  0.303 -0.381  2.581 75.956          78         2.044
5      Napoli  0.421 -0.301  2.085 78.248          77         1.248
6       Lazio  0.086 -0.023  7.758 56.282          68        11.718
7        Roma  0.191 -0.013  6.451 60.363          62         1.637
8    Sassuolo  0.138  0.003  7.210 57.885          62         4.115
9   Sampdoria -0.085 -0.045  9.529 50.955          52         1.045
10     Verona -0.272 -0.174 10.208 49.015          45         4.015
11      Genoa -0.180  0.017 11.360 45.822          42         3.822
12    Bologna -0.075  0.142 11.627 45.125          41         4.125
13 Fiorentina -0.165  0.047 11.530 45.307          40         5.307
14    Udinese -0.287  0.021 12.652 42.224          40         2.224
15     Spezia -0.058  0.243 12.772 42.055          39         3.055
16   Cagliari -0.267  0.039 12.525 42.625          37         5.625
17     Torino -0.100  0.205 12.755 42.060          37         5.060
18  Benevento -0.316  0.271 15.728 33.447          33         0.447
19    Crotone -0.179  0.484 16.679 29.958          23         6.958
20      Parma -0.342  0.371 16.873 29.232          20         9.232

We can compare the simulated ranking with the actual ranking using Kendall’s tau


# Simulated rank of each team (1 = best), row-aligned with `results`,
# which is already sorted by the real standings.
# BUG FIX: the original `rank(results$scores)[K:1]` reversed the rank
# *vector*, assigning row 1 the rank belonging to row 20; that only
# coincides with the intended ranks when the simulated and real
# orderings agree exactly. rank(-scores) gives rank 1 to the highest
# simulated score, correctly aligned with each row's team.
simulated.rank <- rank(-results$scores)
actual.rank <- seq(1, 20, 1)
cor(simulated.rank, actual.rank, method = 'kendall')

[1] 0.8842105

Model Diagnostics

In order to get a sense of how well the chains have converged, we can use Geweke’s convergence diagnostic. We plot below the parameters with the best and worst convergence according to this diagnostic, together with their autocorrelation plots. In general, \(|Z|>2\) means that the chain has not converged very well, and that seems to be the case for the second parameter. However, we can see in the autocorrelation plots that the autocorrelation is stable around 0 for both parameters (a good sign).

Below we show an example of a parameter with high autocorrelation and one with low autocorrelation. In order to identify them we compare their effective sample sizes.


# Effective sample sizes of the first 41 monitored parameters
# (mu, home, a[1:20], d[1:20])
ess <- effectiveSize(coda.fit)[1:41]

# which.max()/which.min() are the idiomatic single-pass way to locate
# the extremes (the redundant [1:41] re-subset is dropped); names()
# recovers the parameter labels. Largest ESS -> lowest autocorrelation,
# smallest ESS -> highest.
low  <- names(which.max(ess))
high <- names(which.min(ess))

bayesplot::mcmc_acf(chainMat[, c(low, high)])

The parameter d[8] (on the right) is the one with the smallest effective sample size, and as we can see its autocorrelation is almost always positive.

Prediction

We can see in the following the HPD intervals for some of the parameters. It is interesting to notice how, in the last plot, the first 5 teams (the European-competition zone) seem to have a clear separation from the others in terms of points. This happens (in the opposite sense) also for the last 3 teams (the relegation-to-Serie-B zone).

We might be interested also in seeing if the parameter \(\text{home}\) is significantly greater than 0 (i.e: there is a true advantage in playing as home team). We can do this by performing a classical hypothesis testing procedure:

\[\begin{cases} H_0: \text{home}\ge0 \\H_1:\text{home}<0\end{cases}\] In order to do this we compute the Bayes factor


# Posterior mean and sd of the home-advantage parameter (used as a
# normal approximation of its posterior)
mu.home <- mod.fit$BUGSoutput$mean$home
sd.home <- mod.fit$BUGSoutput$sd$home

# Prior odds of H0: home >= 0 vs H1: home < 0.
# FIX: the prior is home ~ N(0, precision 1e-4), i.e. sd = 100; the
# original code reused the JAGS precision 1e-4 as the sd. Since the
# prior is centred at 0, both tail probabilities are 0.5 for any sd,
# so the prior odds are 1 and the Bayes factor value is unchanged.
ph0_prior <- pnorm(0, mean = 0, sd = 100, lower.tail = FALSE)
ph1_prior <- pnorm(0, mean = 0, sd = 100)
prior_odds <- ph0_prior/ph1_prior

# Posterior odds under the normal approximation of the posterior
ph0_post <- pnorm(0, mean = mu.home, sd = sd.home, lower.tail = FALSE)
ph1_post <- pnorm(0, mean = mu.home, sd = sd.home)
post_odds <- ph0_post/ph1_post

# Bayes factor in favour of H0 (a real home advantage)
bf <- round(post_odds/prior_odds, 1)
bf

[1] 72.3

A Bayes factor of 72.3 means strong support in favor of \(H_0\), so we can conclude that there is a relevant advantage in playing as home team under our model assumptions.

Recover parameters with simulated data

In this part of the analysis we try to use the same model on the simulated data from before, in order to check if we are able to recover the true (in the simulated world) parameters.


## Build data
goal1.mat <- mod.fit$BUGSoutput$mean$goals1.rep
rownames(goal1.mat) <- teams
colnames(goal1.mat) <- teams
goal1.df <- melt(goal1.mat)
colnames(goal1.df) = c('HomeTeam', 'AwayTeam', 'FTHG')

goal2.mat <- mod.fit$BUGSoutput$mean$goals2.rep
rownames(goal2.mat) <- teams
colnames(goal2.mat) <- teams
goal2.df <- melt(goal2.mat)
colnames(goal2.df) = c('HomeTeam', 'AwayTeam', 'FTAG')

df.final <- data.frame(goal1.df$HomeTeam, goal1.df$AwayTeam, 
                       goal1.df$FTHG, goal2.df$FTAG)
colnames(df.final)<- c('HomeTeam', 'AwayTeam', 'FTHG', 'FTAG')
df.final <- df.final[(df.final$HomeTeam != df.final$AwayTeam),]

# Goals must be integer in order to use Poisson
df.final$FTHG <- round(df$FTHG, 1)
df.final$FTAG <- round(df$FTAG, 1)

## Model for parameter recovery: the same Poisson log-linear structure
## as `model`, without the league-replication block (on the simulated
## data we only need the posteriors of mu, home, a and d).
model.new <-function(){
  ## Likelihood: home/away goals are conditionally independent Poisson
  for (i in 1:N){
    goals1[i] ~ dpois( lambda1[i] )
    goals2[i] ~ dpois( lambda2[i] )
    
    # Log link: overall mean + home advantage (home side only)
    # + attack of the scoring team + defence of the conceding team
    log(lambda1[i]) <- mu + home + a[ ht[i] ] + d[ at[i] ]
    log(lambda2[i]) <- mu        + a[ at[i] ] + d[ ht[i] ]
  }
  
  #  Sum-to-zero constraints (identifiability): team 1's parameters
  #  are determined by the other K-1
  a[1] <- -sum( a[2:K] )
  d[1] <- -sum( d[2:K] )
  
  # Vague normal priors (JAGS dnorm takes a precision: 1e-4 -> sd = 100)
  mu   ~ dnorm(0, 1.0E-4)
  home ~ dnorm(0, 1.0E-4)
  
  for (i in 2:K){
    a[i] ~ dnorm(0, 1.0E-4)
    d[i] ~ dnorm(0, 1.0E-4)
  }
}

# JAGS data for the recovery fit: the simulated season plus the
# dimensions N (matches) and K (teams). JAGS reads the entries by
# name, so their order in the list is irrelevant.
dat.jags <- list("N" = N, "K" = K,
                 "ht" = df.final$HomeTeam, "at" = df.final$AwayTeam,
                 "goals1" = df.final$FTHG, "goals2" = df.final$FTAG)

# Only the structural parameters are monitored this time
mod.params <- c("mu", "home", "a", "d")

# Same MCMC settings as the original fit: 3 chains, 9000 iterations,
# 1000 burn-in, thinning of 10
mod.fit.sim <- jags(data = dat.jags,
                    model.file = model.new,
                    parameters.to.save = mod.params,
                    n.chains = 3,
                    n.iter = 9000, n.burnin = 1000, n.thin = 10)

Compiling model graph
   Resolving undeclared variables
   Allocating nodes
Graph information:
   Observed stochastic nodes: 760
   Unobserved stochastic nodes: 40
   Total graph size: 3090

Initializing model

# Compare the parameters recovered from the simulated data
# (mod.fit.sim) with the original estimates on the real data (mod.fit)
results.2 <- data.frame(teams,
                  round(mod.fit.sim$BUGSoutput$mean$a, 3),
                  round(mod.fit.sim$BUGSoutput$mean$d, 3),
                  round(mod.fit$BUGSoutput$mean$a, 3),
                  round(mod.fit$BUGSoutput$mean$d, 3))

# 'a'/'d' hold the recovered parameters, 'a.new'/'d.new' the original
# estimates they should match
colnames(results.2) <- c('teams', 'a', 'd', 'a.new', 'd.new')

# Absolute recovery error for attack and defence.
# FIX: the defence column was mislabelled `b.abs.diff`; renamed to
# d.abs.diff (nothing downstream referenced the old name).
results.2$a.abs.diff <- abs(results.2$a - results.2$a.new)
results.2$d.abs.diff <- abs(results.2$d - results.2$d.new)

results.2

        teams      a      d  a.new  d.new a.abs.diff b.abs.diff
1    Atalanta  0.029  0.078  0.473 -0.148      0.444      0.226
2   Benevento  0.067  0.099 -0.316  0.271      0.383      0.172
3     Bologna  0.256  0.071 -0.075  0.142      0.331      0.071
4    Cagliari -0.128 -0.217 -0.267  0.039      0.139      0.256
5     Crotone -0.076 -0.222 -0.179  0.484      0.103      0.706
6  Fiorentina  0.195  0.020 -0.165  0.047      0.360      0.027
7       Genoa -0.197  0.016 -0.180  0.017      0.017      0.001
8       Inter  0.029 -0.027  0.447 -0.452      0.418      0.425
9    Juventus -0.126 -0.107  0.303 -0.381      0.429      0.274
10      Lazio -0.190 -0.065  0.086 -0.023      0.276      0.042
11      Milan  0.047  0.039  0.268 -0.306      0.221      0.345
12     Napoli -0.129 -0.019  0.421 -0.301      0.550      0.282
13      Parma -0.096  0.041 -0.342  0.371      0.246      0.330
14       Roma  0.091  0.081  0.191 -0.013      0.100      0.094
15  Sampdoria  0.006 -0.012 -0.085 -0.045      0.091      0.033
16   Sassuolo -0.092 -0.042  0.138  0.003      0.230      0.045
17     Spezia  0.124  0.175 -0.058  0.243      0.182      0.068
18     Torino  0.121  0.112 -0.100  0.205      0.221      0.093
19    Udinese  0.237  0.155 -0.287  0.021      0.524      0.134
20     Verona -0.168 -0.176 -0.272 -0.174      0.104      0.002

In the table above we compare only the attack and defense parameters. We can see from the columns of absolute differences that the model can recover the original parameters quite decently, since the differences are all \(<1\).

Alternative Model

The Poisson distribution has the property that its mean and variance are both equal to its parameter \(\lambda\). If we want to model the variance of the response variable independently of its mean we can consider the Negative Binomial distribution: \[\begin{align} Y_{ij} & \sim \text{NB}(p_{ij},r_j) \quad & \text{for} \;j=1,2 \quad i=1,\dots ,N \\ p_{ij} &= \frac{r_j}{r_j+\lambda_{ij}} \\ \log(\lambda_{i1}) & =\mu+\text{home} +s_{HT_i}-s_{AT_i} \\ \log(\lambda_{i2}) &= \mu + s_{AT_i}-s_{HT_i} & \end{align}\]

We also substituted the attack and defense parameters with an overall strength score \(s_k\) (with the usual STZ constraint). The prior distributions for the parameters are \[ \begin{align} \mu,\text{home},s_k &\sim \mathcal N(0, 1\times10^4) &\quad k=1,\dots,K \\ r_j&\sim\text{Unif}(0,50) &\quad j=1,2\end{align}\]


## Alternative model: negative-binomial likelihood, which decouples the
## variance of the goal counts from their mean, with a single overall
## strength score s[k] per team instead of separate attack/defence.
model.alt <-function(){
  ## Likelihood: goals ~ NB(p, r) with success probability p derived
  ## from the log-linear rate lambda; r1/r2 are the home/away
  ## dispersion parameters
  for (i in 1:N){
    goals1[i] ~ dnegbin( p1[i], r1 )
    goals2[i] ~ dnegbin( p2[i], r2 )
    
    # NB parameterisation: p = r / (r + lambda), so E[goals] = lambda
    p1[i] <- r1/(r1 + lambda1[i])
    p2[i] <- r2/(r2 + lambda2[i])
    
    # Log link: strength of the scoring side minus strength of the
    # opponent; home advantage enters the home rate only
    log(lambda1[i]) <- mu + home + s[ ht[i] ] - s[ at[i] ]
    log(lambda2[i]) <- mu        + s[ at[i] ] - s[ ht[i] ]
  }
  
  #  Sum-to-zero constraint on the strength scores (identifiability)
  s[1] <- -sum( s[2:K] )
  
  # Vague normal priors (JAGS dnorm takes a precision: 1e-4 -> sd = 100)
  mu   ~ dnorm(0, 1.0E-4)
  home ~ dnorm(0, 1.0E-4)
  
  # Uniform priors on the dispersion parameters
  r1 ~ dunif(0,50)
  r2 ~ dunif(0,50)
  
  for (i in 2:K){
    s[i] ~ dnorm(0, 1.0E-4)
  }
}

# Number of matches and number of teams (same as before)
N <- nrow(df)
K <- length(teams)

# Data for the negative-binomial fit: the real season again. JAGS
# reads the entries by name, so their order is irrelevant.
dat.jags <- list("N" = N, "K" = K,
                 "ht" = df$HomeTeam, "at" = df$AwayTeam,
                 "goals1" = df$FTHG, "goals2" = df$FTAG)

# Monitor the strength scores and the two dispersion parameters
mod.params <- c("mu", "home", "s", "r1", "r2")

# Same MCMC settings as the Poisson fit
mod.fit.alt <- jags(data = dat.jags,
                    model.file = model.alt,
                    parameters.to.save = mod.params,
                    n.chains = 3,
                    n.iter = 9000, n.burnin = 1000, n.thin = 10)

Compiling model graph
   Resolving undeclared variables
   Allocating nodes
Graph information:
   Observed stochastic nodes: 760
   Unobserved stochastic nodes: 23
   Total graph size: 4631

Initializing model

         team    strength
8       Inter  0.44436652
12     Napoli  0.37328702
1    Atalanta  0.35477364
9    Juventus  0.32553582
11      Milan  0.27665645
14       Roma  0.11367771
16   Sassuolo  0.06785479
10      Lazio  0.05046760
15  Sampdoria -0.01637843
20     Verona -0.04081226
7       Genoa -0.09221732
6  Fiorentina -0.10239836
3     Bologna -0.12026822
19    Udinese -0.13420904
4    Cagliari -0.13555488
18     Torino -0.15759699
17     Spezia -0.16832524
2   Benevento -0.29062082
13      Parma -0.36316077
5     Crotone -0.38507722

In order to compare the two models we can use the deviance information criterion (DIC): for the Poisson model we have a DIC value of 2332.78, while for the Negative Binomial model the value is 2286.8. According to this measure, the better-fitting model seems to be the Negative Binomial: even though its mean deviance is slightly higher (2263.06 vs 2258.67 for the Poisson model), its effective number of parameters is much smaller (23.73 vs 74.11), and the DIC penalizes a higher number of parameters.